In [1]:
    
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
    
In [2]:
    
# Convert the BIOM table to a tab-delimited file in bash, keeping the 'taxonomy' column:
# biom convert -i reference-hit.tax.biom -o table.from_biom.txt --to-tsv --header-key 'taxonomy'
    
In [3]:
    
# Load the tab-delimited OTU table, indexed by the '#OTU ID' column.
# skiprows=1 drops the first line of the file so the second line is used as the header
# (presumably the banner line written by `biom convert` -- verify against the file).
# NOTE(review): this path goes up two directories ('../../data') while every other
# path in this notebook uses '../data' -- confirm which location is intended.
bt = pd.read_csv(
    '../../data/table.from_biom.txt',
    sep='\t',
    skiprows=1,
    index_col='#OTU ID',
)
    
In [4]:
    
# Report the table dimensions as (rows, columns), then preview the first five rows.
bt_dims = bt.shape
print(bt_dims)
bt.head(5)
    
    
    Out[4]:
In [5]:
    
# Shortest and longest taxonomy string (in characters) across all rows.
# The length Series is computed once and reused for both extremes.
taxonomy_len = bt['taxonomy'].str.len()
print(taxonomy_len.min())
print(taxonomy_len.max())
    
    
In [6]:
    
# Save the table (including its index column) as a tab-delimited file.
bt.to_csv(
    '../data/biomtable.txt',
    sep='\t',
)
    
In [7]:
    
# Load the sample mapping file, indexed by '#SampleID'.
# dtype=str reads every column as text; numeric/categorical conversion is done explicitly later.
mf = pd.read_csv(
    '../data/mapping_MrOS.txt',
    sep='\t',
    index_col='#SampleID',
    dtype=str,
)
    
In [8]:
    
# Report the mapping-file dimensions as (rows, columns), then preview the first five rows.
mf_dims = mf.shape
print(mf_dims)
mf.head(5)
    
    
    Out[8]:
In [9]:
    
# Column names of the mapping file that are treated as categorical variables.
vars_cat = np.array([
    'BarcodeSequence', 'LinkerPrimerSequence', 'Experiment_Design_Description',
    'Library_Construction_Protocol', 'Linker', 'Platform',
    'Center_Name', 'Center_Project', 'Instrument_Model',
    'Title', 'Anonymized_Name', 'Scientific_Name', 'Taxon_ID', 'Sample_Type',
    'Geo_Loc_Name', 'Elevation',
    'Env_Biome', 'Env_Feature', 'Env_Material', 'Env_Package',
    'Collection_Timestamp', 'DNA_Extracted',
    'Physical_Specimen_Location', 'Physical_Specimen_Remaining',
    'Age_Units', 'Host_Subject_ID', 'Host_Taxid',
    'Host_Scientific_Name', 'Host_Common_Name',
    'Life_Stage', 'Sex', 'Height_Units', 'Weight_Units',
    'Body_Habitat', 'Body_Site', 'Body_Product',
    'GIERACE', 'SITE', 'TUDRAMT', 'TURSMOKE',
    'M1ADEPR', 'M1VITMND', 'M1ANTIB', 'M1PROBI',
    'OHSEAS', 'VDstatus', 'Description',
    'OHV1D2CT', 'OHVD2CT',
])

# Column names of the mapping file that are treated as continuous (numeric) variables.
vars_cts = np.array([
    'Latitude', 'Longitude',
    'Age', 'Height', 'Weight', 'BMI',
    'PASCORE', 'DTVITD',
    'OHV1D3', 'OHV24D3', 'OHVD3', 'OHVD2', 'OHV1D2',
    'OHVDTOT', 'OHV1DTOT',
])
    
In [10]:
    
# Build the cleaned frame from a copy so the raw mapping frame `mf` is left untouched.
df = mf.copy()

# Continuous variables -> numeric; entries that cannot be parsed become NaN (errors='coerce').
for cts_col in vars_cts:
    df[cts_col] = pd.to_numeric(df[cts_col], errors='coerce')

# Categorical variables -> pandas 'category' dtype, one column at a time.
for cat_col in vars_cat:
    df[cat_col] = df[cat_col].astype('category')
    
In [11]:
    
# Convert the columns recorded in pg/ml to ng/ml (1 ng/ml = 1000 pg/ml).
PG_PER_NG = 1000
pg_cols = ['OHV1D3', 'OHV1D2', 'OHV1DTOT']

# Recompute from the raw mapping frame `mf` rather than dividing `df` in place:
# the in-place form (df.X = df.X/1000) shrinks the values by another factor of 1000
# every time the cell is re-run. The same numeric coercion as the dtype-conversion
# cell is applied (unparseable entries become NaN), so the first-run result is unchanged.
df[pg_cols] = mf[pg_cols].apply(pd.to_numeric, errors='coerce') / PG_PER_NG
    
In [12]:
    
#df.M1ANTIB.value_counts()
    
In [13]:
    
# Derived ratios, both relative to OHVD3.
# Units: OHV1D3 has already been rescaled from pg/ml to ng/ml (divided by 1000) in an
# earlier cell, so the activation ratio equals OHV1D3[pg/ml] / (OHVD3[ng/ml] * 1000).
# OHV24D3 and OHVD3 are both in ng/ml, so the catabolism ratio needs no rescaling.
df['ratio_activation'] = df.OHV1D3 / df.OHVD3
df['ratio_catabolism'] = df.OHV24D3 / df.OHVD3

# Register the new columns as continuous variables. Any existing occurrences are
# removed before appending, so re-running this cell does not add duplicate names
# to vars_cts (which would duplicate columns in df[vars_cts]).
ratio_vars = ['ratio_activation', 'ratio_catabolism']
vars_cts = np.append(vars_cts[~np.isin(vars_cts, ratio_vars)], ratio_vars)
    
In [14]:
    
# Summary statistics for the continuous variables.
df.loc[:, vars_cts].describe()
    
    Out[14]:
In [15]:
    
# Summary of the categorical variables.
df.loc[:, vars_cat].describe()
    
    Out[15]:
In [16]:
    
# Number of missing values in each continuous variable.
df.loc[:, vars_cts].isna().sum()
    
    Out[16]:
In [17]:
    
# for i in range(len(vars_cat)):
#     print(df[vars_cat[i]].value_counts())
    
In [18]:
    
# Sanity check: print the dimensions of the raw mapping frame, then of the cleaned frame.
for frame in (mf, df):
    print(frame.shape)
    
    
In [19]:
    
# Save the cleaned mapping frame, index included, as a tab-delimited file.
df.to_csv(
    '../data/mapping_cleaned_MrOS.txt',
    sep='\t',
    index=True,
)
    
In [ ]: